{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib as plt\n",
    "path = '/Users/linqiliang/Documents/PySci/data/pydata-book/ch02/movielens/'\n",
    "from ipykernel import kernelapp as app"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "unames = ['user_id','gender','age','occupation','zip']\n",
    "users = pd.read_table(path+'users.dat',sep='::',header = None,names=unames,engine = 'python')\n",
    "rnames = ['user_id','movie_id','rating','timestamp']\n",
    "ratings = pd.read_table(path+'ratings.dat',sep='::',header = None,names=rnames,engine = 'python')\n",
    "mnames = ['movie_id','title','genres']\n",
    "movies = pd.read_table(path+'movies.dat',sep='::',header = None,names=mnames,engine = 'python')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>gender</th>\n",
       "      <th>age</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>M</td>\n",
       "      <td>56</td>\n",
       "      <td>16</td>\n",
       "      <td>70072</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>M</td>\n",
       "      <td>25</td>\n",
       "      <td>15</td>\n",
       "      <td>55117</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>M</td>\n",
       "      <td>45</td>\n",
       "      <td>7</td>\n",
       "      <td>02460</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id gender  age  occupation    zip\n",
       "0        1      F    1          10  48067\n",
       "1        2      M   56          16  70072\n",
       "2        3      M   25          15  55117\n",
       "3        4      M   45           7  02460"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1193</td>\n",
       "      <td>5</td>\n",
       "      <td>978300760</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>661</td>\n",
       "      <td>3</td>\n",
       "      <td>978302109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>914</td>\n",
       "      <td>3</td>\n",
       "      <td>978301968</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>3408</td>\n",
       "      <td>4</td>\n",
       "      <td>978300275</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  movie_id  rating  timestamp\n",
       "0        1      1193       5  978300760\n",
       "1        1       661       3  978302109\n",
       "2        1       914       3  978301968\n",
       "3        1      3408       4  978300275"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movie_id</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Animation|Children's|Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>Adventure|Children's|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Grumpier Old Men (1995)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Waiting to Exhale (1995)</td>\n",
       "      <td>Comedy|Drama</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movie_id                     title                        genres\n",
       "0         1          Toy Story (1995)   Animation|Children's|Comedy\n",
       "1         2            Jumanji (1995)  Adventure|Children's|Fantasy\n",
       "2         3   Grumpier Old Men (1995)                Comedy|Romance\n",
       "3         4  Waiting to Exhale (1995)                  Comedy|Drama"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movie_id</th>\n",
       "      <th>title</th>\n",
       "      <th>gender</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1176</th>\n",
       "      <td>1193</td>\n",
       "      <td>One Flew Over the Cuckoo's Nest (1975)</td>\n",
       "      <td>Drama</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      movie_id                                   title gender\n",
       "1176      1193  One Flew Over the Cuckoo's Nest (1975)  Drama"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies[movies['movie_id']==1193]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data = pd.merge(ratings,users,left_on='user_id',right_on='user_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data2= pd.merge(movies,data,left_on='movie_id',right_on='movie_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  if __name__ == '__main__':\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movie_id</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>user_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>gender</th>\n",
       "      <th>age</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Animation|Children's|Comedy</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>978824268</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movie_id             title                       genres  user_id  rating  \\\n",
       "0         1  Toy Story (1995)  Animation|Children's|Comedy        1       5   \n",
       "\n",
       "   timestamp gender  age  occupation    zip  \n",
       "0  978824268      F    1          10  48067  "
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data2[data2['user_id']==1][data2['movie_id']==1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data3 = pd.merge(pd.merge(ratings,users),movies)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  if __name__ == '__main__':\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>gender</th>\n",
       "      <th>age</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>41626</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>978824268</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Animation|Children's|Comedy</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       user_id  movie_id  rating  timestamp gender  age  occupation    zip  \\\n",
       "41626        1         1       5  978824268      F    1          10  48067   \n",
       "\n",
       "                  title                       genres  \n",
       "41626  Toy Story (1995)  Animation|Children's|Comedy  "
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data3[data3['user_id']==1][data3['movie_id']==1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "PiovtTab=pd.pivot_table(data3,values='rating',index=['movie_id','title'],columns=['gender'],aggfunc=np.mean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gender</th>\n",
       "      <th>F</th>\n",
       "      <th>M</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>movie_id</th>\n",
       "      <th>title</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <th>Toy Story (1995)</th>\n",
       "      <td>4.187817</td>\n",
       "      <td>4.130552</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>Jumanji (1995)</th>\n",
       "      <td>3.278409</td>\n",
       "      <td>3.175238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <th>Grumpier Old Men (1995)</th>\n",
       "      <td>3.073529</td>\n",
       "      <td>2.994152</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>Waiting to Exhale (1995)</th>\n",
       "      <td>2.976471</td>\n",
       "      <td>2.482353</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "gender                                    F         M\n",
       "movie_id title                                       \n",
       "1        Toy Story (1995)          4.187817  4.130552\n",
       "2        Jumanji (1995)            3.278409  3.175238\n",
       "3        Grumpier Old Men (1995)   3.073529  2.994152\n",
       "4        Waiting to Exhale (1995)  2.976471  2.482353"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PiovtTab[:4]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 筛选每个人最喜欢的电影"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>gender</th>\n",
       "      <th>age</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1193</td>\n",
       "      <td>5</td>\n",
       "      <td>978300760</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "      <td>One Flew Over the Cuckoo's Nest (1975)</td>\n",
       "      <td>Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1725</th>\n",
       "      <td>1</td>\n",
       "      <td>661</td>\n",
       "      <td>3</td>\n",
       "      <td>978302109</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "      <td>James and the Giant Peach (1996)</td>\n",
       "      <td>Animation|Children's|Musical</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2250</th>\n",
       "      <td>1</td>\n",
       "      <td>914</td>\n",
       "      <td>3</td>\n",
       "      <td>978301968</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "      <td>My Fair Lady (1964)</td>\n",
       "      <td>Musical|Romance</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user_id  movie_id  rating  timestamp gender  age  occupation    zip  \\\n",
       "0           1      1193       5  978300760      F    1          10  48067   \n",
       "1725        1       661       3  978302109      F    1          10  48067   \n",
       "2250        1       914       3  978301968      F    1          10  48067   \n",
       "\n",
       "                                       title                        genres  \n",
       "0     One Flew Over the Cuckoo's Nest (1975)                         Drama  \n",
       "1725        James and the Giant Peach (1996)  Animation|Children's|Musical  \n",
       "2250                     My Fair Lady (1964)               Musical|Romance  "
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "userx = data3[data3['user_id']==1]\n",
    "userx[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n",
      "  if __name__ == '__main__':\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>gender</th>\n",
       "      <th>age</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1193</td>\n",
       "      <td>5</td>\n",
       "      <td>978300760</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "      <td>One Flew Over the Cuckoo's Nest (1975)</td>\n",
       "      <td>Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50759</th>\n",
       "      <td>1</td>\n",
       "      <td>1029</td>\n",
       "      <td>5</td>\n",
       "      <td>978302205</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "      <td>Dumbo (1941)</td>\n",
       "      <td>Animation|Children's|Musical</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41626</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>978824268</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Animation|Children's|Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19503</th>\n",
       "      <td>1</td>\n",
       "      <td>3105</td>\n",
       "      <td>5</td>\n",
       "      <td>978301713</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "      <td>Awakenings (1990)</td>\n",
       "      <td>Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43703</th>\n",
       "      <td>1</td>\n",
       "      <td>1961</td>\n",
       "      <td>5</td>\n",
       "      <td>978301590</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "      <td>Rain Man (1988)</td>\n",
       "      <td>Drama</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       user_id  movie_id  rating  timestamp gender  age  occupation    zip  \\\n",
       "0            1      1193       5  978300760      F    1          10  48067   \n",
       "50759        1      1029       5  978302205      F    1          10  48067   \n",
       "41626        1         1       5  978824268      F    1          10  48067   \n",
       "19503        1      3105       5  978301713      F    1          10  48067   \n",
       "43703        1      1961       5  978301590      F    1          10  48067   \n",
       "\n",
       "                                        title                        genres  \n",
       "0      One Flew Over the Cuckoo's Nest (1975)                         Drama  \n",
       "50759                            Dumbo (1941)  Animation|Children's|Musical  \n",
       "41626                        Toy Story (1995)   Animation|Children's|Comedy  \n",
       "19503                       Awakenings (1990)                         Drama  \n",
       "43703                         Rain Man (1988)                         Drama  "
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "userx.sort(columns=['rating'],ascending=False,axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>gender</th>\n",
       "      <th>age</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1193</td>\n",
       "      <td>5</td>\n",
       "      <td>978300760</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "      <td>One Flew Over the Cuckoo's Nest (1975)</td>\n",
       "      <td>Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1193</td>\n",
       "      <td>5</td>\n",
       "      <td>978298413</td>\n",
       "      <td>M</td>\n",
       "      <td>56</td>\n",
       "      <td>16</td>\n",
       "      <td>70072</td>\n",
       "      <td>One Flew Over the Cuckoo's Nest (1975)</td>\n",
       "      <td>Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4202</th>\n",
       "      <td>3</td>\n",
       "      <td>2355</td>\n",
       "      <td>5</td>\n",
       "      <td>978298430</td>\n",
       "      <td>M</td>\n",
       "      <td>25</td>\n",
       "      <td>15</td>\n",
       "      <td>55117</td>\n",
       "      <td>Bug's Life, A (1998)</td>\n",
       "      <td>Animation|Children's|Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>224023</th>\n",
       "      <td>4</td>\n",
       "      <td>1240</td>\n",
       "      <td>5</td>\n",
       "      <td>978294260</td>\n",
       "      <td>M</td>\n",
       "      <td>45</td>\n",
       "      <td>7</td>\n",
       "      <td>02460</td>\n",
       "      <td>Terminator, The (1984)</td>\n",
       "      <td>Action|Sci-Fi|Thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>300678</th>\n",
       "      <td>5</td>\n",
       "      <td>1732</td>\n",
       "      <td>5</td>\n",
       "      <td>978245740</td>\n",
       "      <td>M</td>\n",
       "      <td>25</td>\n",
       "      <td>20</td>\n",
       "      <td>55455</td>\n",
       "      <td>Big Lebowski, The (1998)</td>\n",
       "      <td>Comedy|Crime|Mystery|Thriller</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        user_id  movie_id  rating  timestamp gender  age  occupation    zip  \\\n",
       "0             1      1193       5  978300760      F    1          10  48067   \n",
       "1             2      1193       5  978298413      M   56          16  70072   \n",
       "4202          3      2355       5  978298430      M   25          15  55117   \n",
       "224023        4      1240       5  978294260      M   45           7  02460   \n",
       "300678        5      1732       5  978245740      M   25          20  55455   \n",
       "\n",
       "                                         title                         genres  \n",
       "0       One Flew Over the Cuckoo's Nest (1975)                          Drama  \n",
       "1       One Flew Over the Cuckoo's Nest (1975)                          Drama  \n",
       "4202                      Bug's Life, A (1998)    Animation|Children's|Comedy  \n",
       "224023                  Terminator, The (1984)         Action|Sci-Fi|Thriller  \n",
       "300678                Big Lebowski, The (1998)  Comedy|Crime|Mystery|Thriller  "
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user = set(data3['user_id'])\n",
    "for x in user:\n",
    "    userx = data3[data3['user_id']==x]\n",
    "    if x==1:\n",
    "        tx = userx.sort_values(by=['rating'],ascending=False,axis=0)[:1]\n",
    "    else:\n",
    "        d = [tx,userx.sort_values(by=['rating'],ascending=False,axis=0)[:1]]\n",
    "        tx = pd.concat(d)\n",
    "tx[0:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 搜索评分数据大于250条的电影"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "## 下面方法已经按照评分数据量做了排序\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "movie_count= data3.groupby('movie_id').size()\n",
    "moive = movie_count.sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mean_ratings = data3.pivot_table(values='rating',index = ['title'],columns =['gender'],aggfunc = np.mean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>gender</th>\n",
       "      <th>F</th>\n",
       "      <th>M</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>title</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>$1,000,000 Duck (1971)</th>\n",
       "      <td>3.375000</td>\n",
       "      <td>2.761905</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'Night Mother (1986)</th>\n",
       "      <td>3.388889</td>\n",
       "      <td>3.352941</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'Til There Was You (1997)</th>\n",
       "      <td>2.675676</td>\n",
       "      <td>2.733333</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "gender                            F         M\n",
       "title                                        \n",
       "$1,000,000 Duck (1971)     3.375000  2.761905\n",
       "'Night Mother (1986)       3.388889  3.352941\n",
       "'Til There Was You (1997)  2.675676  2.733333"
      ]
     },
     "execution_count": 147,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mean_ratings[0:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "title\n",
       "$1,000,000 Duck (1971)        37\n",
       "'Night Mother (1986)          70\n",
       "'Til There Was You (1997)     52\n",
       "'burbs, The (1989)           303\n",
       "dtype: int64"
      ]
     },
     "execution_count": 148,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings_by_title = data3.groupby('title').size()\n",
    "ratings_by_title[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "active_titles = ratings_by_title.index[ratings_by_title >= 250]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)',\n",
       "       u'101 Dalmatians (1961)', u'101 Dalmatians (1996)'],\n",
       "      dtype='object', name=u'title')"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "active_titles[0:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>gender</th>\n",
       "      <th>F</th>\n",
       "      <th>M</th>\n",
       "      <th>diff</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>title</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Clean Slate (Coup de Torchon) (1981)</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.857143</td>\n",
       "      <td>1.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ballad of Narayama, The (Narayama Bushiko) (1958)</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.428571</td>\n",
       "      <td>1.571429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Raw Deal (1948)</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.307692</td>\n",
       "      <td>1.692308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bittersweet Motel (2000)</th>\n",
       "      <td>5.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Skipped Parts (2000)</th>\n",
       "      <td>5.0</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lamerica (1994)</th>\n",
       "      <td>5.0</td>\n",
       "      <td>4.666667</td>\n",
       "      <td>0.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Gambler, The (A J�t�kos) (1997)</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.166667</td>\n",
       "      <td>1.833333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Brother, Can You Spare a Dime? (1975)</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.642857</td>\n",
       "      <td>1.357143</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ayn Rand: A Sense of Life (1997)</th>\n",
       "      <td>5.0</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24 7: Twenty Four Seven (1997)</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.750000</td>\n",
       "      <td>1.250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "gender                                               F         M      diff\n",
       "title                                                                     \n",
       "Clean Slate (Coup de Torchon) (1981)               5.0  3.857143  1.142857\n",
       "Ballad of Narayama, The (Narayama Bushiko) (1958)  5.0  3.428571  1.571429\n",
       "Raw Deal (1948)                                    5.0  3.307692  1.692308\n",
       "Bittersweet Motel (2000)                           5.0       NaN       NaN\n",
       "Skipped Parts (2000)                               5.0  4.000000  1.000000\n",
       "Lamerica (1994)                                    5.0  4.666667  0.333333\n",
       "Gambler, The (A J�t�kos) (1997)                    5.0  3.166667  1.833333\n",
       "Brother, Can You Spare a Dime? (1975)              5.0  3.642857  1.357143\n",
       "Ayn Rand: A Sense of Life (1997)                   5.0  4.000000  1.000000\n",
       "24 7: Twenty Four Seven (1997)                     5.0  3.750000  1.250000"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "top_female_tatings = mean_ratings.sort_values(by='F',ascending = False)\n",
    "top_female_tatings[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "mean_ratings['diff'] = mean_ratings['F'] - mean_ratings['M']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>gender</th>\n",
       "      <th>F</th>\n",
       "      <th>M</th>\n",
       "      <th>diff</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>title</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>$1,000,000 Duck (1971)</th>\n",
       "      <td>3.375000</td>\n",
       "      <td>2.761905</td>\n",
       "      <td>0.613095</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'Night Mother (1986)</th>\n",
       "      <td>3.388889</td>\n",
       "      <td>3.352941</td>\n",
       "      <td>0.035948</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'Til There Was You (1997)</th>\n",
       "      <td>2.675676</td>\n",
       "      <td>2.733333</td>\n",
       "      <td>-0.057658</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'burbs, The (1989)</th>\n",
       "      <td>2.793478</td>\n",
       "      <td>2.962085</td>\n",
       "      <td>-0.168607</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "gender                            F         M      diff\n",
       "title                                                  \n",
       "$1,000,000 Duck (1971)     3.375000  2.761905  0.613095\n",
       "'Night Mother (1986)       3.388889  3.352941  0.035948\n",
       "'Til There Was You (1997)  2.675676  2.733333 -0.057658\n",
       "'burbs, The (1989)         2.793478  2.962085 -0.168607"
      ]
     },
     "execution_count": 155,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mean_ratings[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>gender</th>\n",
       "      <th>F</th>\n",
       "      <th>M</th>\n",
       "      <th>diff</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>title</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Tigrero: A Film That Was Never Made (1994)</th>\n",
       "      <td>1.0</td>\n",
       "      <td>4.333333</td>\n",
       "      <td>-3.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Neon Bible, The (1995)</th>\n",
       "      <td>1.0</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>-3.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Enfer, L' (1994)</th>\n",
       "      <td>1.0</td>\n",
       "      <td>3.750000</td>\n",
       "      <td>-2.750000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Stalingrad (1993)</th>\n",
       "      <td>1.0</td>\n",
       "      <td>3.593750</td>\n",
       "      <td>-2.593750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Killer: A Journal of Murder (1995)</th>\n",
       "      <td>1.0</td>\n",
       "      <td>3.428571</td>\n",
       "      <td>-2.428571</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "gender                                        F         M      diff\n",
       "title                                                              \n",
       "Tigrero: A Film That Was Never Made (1994)  1.0  4.333333 -3.333333\n",
       "Neon Bible, The (1995)                      1.0  4.000000 -3.000000\n",
       "Enfer, L' (1994)                            1.0  3.750000 -2.750000\n",
       "Stalingrad (1993)                           1.0  3.593750 -2.593750\n",
       "Killer: A Journal of Murder (1995)          1.0  3.428571 -2.428571"
      ]
     },
     "execution_count": 165,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mean_ratings.sort_values(by=['diff'],ascending = True)[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rating_std_by_title = data3.groupby('title')['rating'].std()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rating_std_by_title = rating_std_by_title.ix[active_titles]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "title\n",
       "'burbs, The (1989)                   1.107760\n",
       "10 Things I Hate About You (1999)    0.989815\n",
       "101 Dalmatians (1961)                0.982103\n",
       "101 Dalmatians (1996)                1.098717\n",
       "12 Angry Men (1957)                  0.812731\n",
       "Name: rating, dtype: float64"
      ]
     },
     "execution_count": 169,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_std_by_title[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
